In [15]:
%run notebook.config.ipy


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
set database db: ../csvdb
set database anndb: /gfs/mirror/annotations/mm10_ensembl78/csvdb
set database ipydb: csvdb

In [16]:
count_table_filtered = DB.fetch_DataFrame('''select * from count_table_filtered''', ipydb)

In [17]:
%%R
#additional libraries
library(matrixStats)
library(DESeq2)
library(genefilter)
library(statmod)

(1) Calcuation of size factors


In [18]:
%%R -i count_table_filtered
require(DESeq2)
#print(head(count_table))

dataMouse <- count_table_filtered
rownames(dataMouse) <- dataMouse$gene_id
dataMouse$gene_id <- NULL


geneTypes <- factor( c( ENSM="ENSM", ERCC="ERCC" )[
  substr( rownames(dataMouse), 1, 4 ) ] )

#2. calculate normalisation for counts
countsMmus <- dataMouse[ which( geneTypes=="ENSM" ), ]
countsERCC <- dataMouse[ which( geneTypes=="ERCC" ), ]
#lengthsMmus <- dataMouse[ which( geneTypes=="ENSM" ), 1 ]
#lengthsERCC <- dataMouse[ which( geneTypes=="ERCC" ), 1 ]

sfMmus <- estimateSizeFactorsForMatrix( countsMmus )
sfERCC <- estimateSizeFactorsForMatrix( countsERCC )

In [19]:
%%R
#print(tail(rownames(dataMouse)))
rbind(sfERCC, sfMmus)


          index Scid.recons.p2.A.1 Scid.recons.p2.A.2 Scid.recons.p2.A.3
sfERCC 18.60890          0.2964068          0.6724569          0.2330867
sfMmus 23.82908          0.5159793          1.2156015          1.2560887
       Scid.recons.p2.A.4 Scid.recons.p2.A.5 Scid.recons.p2.A.6
sfERCC          0.8460965          0.6459079          0.6575731
sfMmus          0.6196086          0.8873627          0.8459765
       Scid.recons.p2.A.7 Scid.recons.p2.A.11 Scid.recons.p2.A.12
sfERCC           1.578221           3.6248161            1.626226
sfMmus           2.320326           0.5202626            2.310159
       Scid.recons.p2.B.2 Scid.recons.p2.B.3 Scid.recons.p2.B.5
sfERCC           1.395629          0.9076484           1.660601
sfMmus           1.219161          2.0278077           1.021641
       Scid.recons.p2.B.7 Scid.recons.p2.B.9 Scid.recons.p2.B.10
sfERCC          0.9694452          1.3588822           0.5629712
sfMmus          1.9355318          0.4755612           0.3812385
       Scid.recons.p2.B.11 Scid.recons.p2.B.12 Scid.recons.p2.C.1
sfERCC           0.9074814            1.009646          0.6340091
sfMmus           1.0803029            0.546962          1.4358739
       Scid.recons.p2.C.2 Scid.recons.p2.C.3 Scid.recons.p2.C.5
sfERCC          1.9458103          0.9373297          0.5970074
sfMmus          0.8312255          1.3495394          0.2980555
       Scid.recons.p2.C.6 Scid.recons.p2.C.7 Scid.recons.p2.C.9
sfERCC          0.9870377          0.3839896          0.5983918
sfMmus          0.4688456          0.1123356          1.3241323
       Scid.recons.p2.C.10 Scid.recons.p2.C.12 Scid.recons.p2.D.3
sfERCC           1.2604431            1.586705          1.8196431
sfMmus           0.9949616            2.080499          0.6212494
       Scid.recons.p2.D.4 Scid.recons.p2.D.6 Scid.recons.p2.D.7
sfERCC           1.415337           1.975577          1.5246661
sfMmus           2.003182           1.931854          0.9948052
       Scid.recons.p2.D.8 Scid.recons.p2.D.9 Scid.recons.p2.D.11
sfERCC          0.8964081          2.2449856            1.103550
sfMmus          1.8811722          0.7051518            1.456535
       Scid.recons.p2.D.12 Scid.recons.p2.E.1 Scid.recons.p2.E.2
sfERCC           0.6967537           1.196565           1.040927
sfMmus           0.4411692           2.664196           1.187105
       Scid.recons.p2.E.3 Scid.recons.p2.E.4 Scid.recons.p2.E.7
sfERCC           1.719067           2.117003           1.435559
sfMmus           1.777406           1.232222           1.091381
       Scid.recons.p2.E.9 Scid.recons.p2.E.11 Scid.recons.p2.F.3
sfERCC           0.982840            1.187187           2.113446
sfMmus           0.467996            1.043086           2.139693
       Scid.recons.p2.F.7 Scid.recons.p2.F.8 Scid.recons.p2.F.9
sfERCC          2.3801611           1.163479          1.7533144
sfMmus          0.9928709           2.082284          0.8864964
       Scid.recons.p2.F.11 Scid.recons.p2.G.1 Scid.recons.p2.G.2
sfERCC           1.5613775          0.4478486          0.5915212
sfMmus           0.8834828          1.1631251          1.6851849
       Scid.recons.p2.G.3 Scid.recons.p2.G.7 Scid.recons.p2.G.8
sfERCC          1.3530460          1.0213466           1.077713
sfMmus          0.4726455          0.8196597           1.889788
       Scid.recons.p2.G.9 Scid.recons.p2.G.10 Scid.recons.p2.G.11
sfERCC          0.8222088            0.487510            1.035994
sfMmus          1.2282968            1.066207            1.569215
       Scid.recons.p2.G.12 Scid.recons.p2.H.1 Scid.recons.p2.H.4
sfERCC           1.1333513          1.3819202          0.9861177
sfMmus           0.8548599          0.5788812          2.0218957
       Scid.recons.p2.H.5 Scid.recons.p2.H.6 Scid.recons.p2.H.7
sfERCC          0.5242913          0.3500948          0.7480122
sfMmus          1.0010565          1.3985130          0.7590189
       Scid.recons.p2.H.8 Scid.recons.p2.H.11 Scid.recons.p2.H.12
sfERCC          0.5118485           0.7166089           0.5393113
sfMmus          1.0406231           0.9116648           0.6522041

(2) Normalisation of spikes and biological genes


In [20]:
%%R
#normalise read counts
nCountsERCC <- t( t(countsERCC) / sfERCC )
nCountsMmus <- t( t(countsMmus) / sfMmus )

(3) Calculation of sample moments


In [21]:
%%R
meansERCC <- rowMeans( nCountsERCC )
varsERCC <- rowVars( nCountsERCC )
cv2ERCC <- varsERCC / meansERCC^2
meansMmus <- rowMeans( nCountsMmus )
varsMmus <- rowVars( nCountsMmus )
cv2Mmus <- varsMmus / meansMmus^2

(4) Fit technical noise


In [61]:
%%R
minMeanForFitA <- unname( quantile( meansERCC[ which( cv2ERCC > .3 ) ], 0.8 ) ) #default 0.3, 0.8
useForFitA <- meansERCC >= minMeanForFitA
minMeanForFitA
table( useForFitA )


useForFitA
FALSE  TRUE 
   57    35 

In [62]:
%%R
fitA <- glmgam.fit( cbind( a0 = 1, a1tilde = 1/meansERCC[useForFitA] ),
 cv2ERCC[useForFitA] )

Variance explained by fit


In [63]:
%%R
residualA <- var( log( fitted.values(fitA) ) - log( cv2ERCC[useForFitA] ) )
totalA <- var( log( cv2ERCC[useForFitA] ) )
1 - residualA / totalA


[1] 0.9149221

In [64]:
%%R
plot( meansERCC, cv2ERCC, log="xy", col=1+useForFitA, main="A" )
xg <- 10^seq( -3, 5, length.out=100 )
lines( xg, coefficients(fitA)["a0"] + coefficients(fitA)["a1tilde"]/xg )
segments( meansERCC[useForFitA], cv2ERCC[useForFitA],
 meansERCC[useForFitA], fitA$fitted.values, col="gray" )


(5) Test for high variance


In [65]:
%%R
minBiolDisp <- .5^2
xi <- mean( 1 / sfERCC )
m <- ncol(countsMmus)
psia1thetaA <- mean( 1 / sfERCC ) +
 ( coefficients(fitA)["a1tilde"] - xi ) * mean( sfERCC / sfMmus )
cv2thA <- coefficients(fitA)["a0"] + minBiolDisp + coefficients(fitA)["a0"] * minBiolDisp

testDenomA <- ( meansMmus * psia1thetaA + meansMmus^2 * cv2thA ) / ( 1 + cv2thA/m )
pA <- 1 - pchisq( varsMmus * (m-1) / testDenomA, m-1 )
padjA <- p.adjust( pA, "BH" )
table( padjA < .1 )


FALSE  TRUE 
 6989 15030 

(6) Plot result


In [66]:
%%R

plot( NULL, xaxt="n", yaxt="n",
 log="xy", xlim = c( 1e-1, 3e5 ), ylim = c( .005, 100 ),
 xlab = "average normalized read count", ylab = "squared coefficient of variation (CV^2)" )
axis( 1, 10^(-1:5), c( "0.1", "1", "10", "100", "1000",
 expression(10^4), expression(10^5) ) )
axis( 2, 10^(-2:2), c( "0.01", "0.1", "1", "10" ,"100"), las=2 )
abline( h=10^(-2:1), v=10^(-1:5), col="#D0D0D0", lwd=2 )
# Plot the plant genes, use a different color if they are highly variable
points( meansMmus, cv2Mmus, pch=20, cex=.2,
 col = ifelse( padjA < .1, "#C0007090", "#70500040" ) )
# Add the technical noise fit, as before
xg <- 10^seq( -2, 6, length.out=1000 )

lines( xg, coefficients(fitA)["a1tilde"] / xg + coefficients(fitA)["a0"], col="#FF000080", lwd=3 )
# Add a curve showing the expectation for the chosen biological CV^2 thershold
lines( xg, psia1thetaA/xg + coefficients(fitA)["a0"] + minBiolDisp,
 lty="dashed", col="#C0007090", lwd=3 )
# Add the normalised ERCC points
points( meansERCC, cv2ERCC, pch=20, cex=1, col="#0060B8A0" )



In [ ]:
# Note that the applied default model fails to fit this data!!